In [1]:
# Core data stack
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plotly for interactive charts and maps
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import plotly.graph_objs as go
from datetime import date
from urllib.request import urlopen
import json
# NOTE(review): jupyter_dash and the standalone dash_html_components /
# dash_core_components packages are deprecated upstream (modern form is
# `from dash import html, dcc`) — TODO confirm and upgrade.
from jupyter_dash import JupyterDash
from dash.dependencies import Input, Output
import dash_html_components as html
import dash_core_components as dcc
import warnings
# Silence library warnings notebook-wide (also hides pandas deprecations)
warnings.filterwarnings('ignore')
In [2]:
from IPython.display import HTML
#https://stackoverflow.com/questions/27934885/how-to-hide-code-from-cells-in-ipython-notebook-visualized-with-nbviewer
In [3]:
# Injects a jQuery-based button into the rendered notebook so readers can
# show/hide all code cells (technique from the Stack Overflow link above).
# The HTML/JS payload below is emitted verbatim into the page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>

<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the code."></form>''')
Out[3]:

Title Page COVID Image

Scrum Planning

Scrum Planning

Best Practices

Best Practices

EDA

Best Practices

In [4]:
"""
Function to web scrape most up to date data from usafacts website so that the code
is always up to date with the most recent COVID-19 information
"""
def scrape_live_data():
    """Download the latest USAFacts confirmed-case and county-population CSVs
    and merge them into a single wide DataFrame.

    Returns:
        pd.DataFrame: one row per county with columns
        [countyFIPS, County Name, State, population, StateFIPS, <one column per date>].
    """
    # Load datasets dynamically from URLs so the notebook always has fresh data
    confirm_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv', header=0)
    pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)

    # Combine into one dataset keyed on county FIPS code and state
    confirmed_cases = pd.merge(left=confirm_df, right=pop_df, how='left',
                               left_on=['countyFIPS', 'State'], right_on=['countyFIPS', 'State'])

    # The merge yields 'County Name_x' / 'County Name_y'; restore the original
    # name on the left copy and drop the duplicate from the population file.
    all_columns = confirmed_cases.columns.values
    all_columns[1] = 'County Name'
    confirmed_cases.columns = all_columns
    confirmed_cases = confirmed_cases.drop(labels=['County Name_y'], axis=1)

    # Move 'population' from the last position to the 4th column for readability
    population_column = confirmed_cases['population']
    confirmed_cases = confirmed_cases.drop(labels=['population'], axis=1)
    confirmed_cases.insert(3, 'population', population_column)

    # Drop known problem rows (presumably bad upstream data — TODO confirm).
    # Intersect first so this no longer raises KeyError if USAFacts
    # removes or renumbers those rows.
    bad_rows = confirmed_cases.index.intersection([1835, 1862, 1863])
    confirmed_cases = confirmed_cases.drop(bad_rows)

    return confirmed_cases


def scrape_live_data_deaths():
    """Download the latest USAFacts deaths and county-population CSVs and
    merge them into a single wide DataFrame.

    Returns:
        pd.DataFrame: one row per county with columns
        [countyFIPS, County Name, State, population, StateFIPS, <one column per date>].
    """
    # Load datasets dynamically from URLs so the notebook always has fresh data
    death_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv', header=0)
    pop_df = pd.read_csv('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv', header=0)

    # Combine into one dataset keyed on county FIPS code and state
    confirmed_deaths = pd.merge(left=death_df, right=pop_df, how='left',
                                left_on=['countyFIPS', 'State'], right_on=['countyFIPS', 'State'])

    # The merge yields 'County Name_x' / 'County Name_y'; restore the original
    # name on the left copy and drop the duplicate from the population file.
    all_columns = confirmed_deaths.columns.values
    all_columns[1] = 'County Name'
    confirmed_deaths.columns = all_columns
    confirmed_deaths = confirmed_deaths.drop(labels=['County Name_y'], axis=1)

    # Move 'population' from the last position to the 4th column for readability
    population_column = confirmed_deaths['population']
    confirmed_deaths = confirmed_deaths.drop(labels=['population'], axis=1)
    confirmed_deaths.insert(3, 'population', population_column)

    # Drop known problem rows (presumably bad upstream data — TODO confirm).
    # Intersect first so this no longer raises KeyError if USAFacts
    # removes or renumbers those rows.
    bad_rows = confirmed_deaths.index.intersection([1835, 1862, 1863])
    confirmed_deaths = confirmed_deaths.drop(bad_rows)

    return confirmed_deaths


def exploratoryDA(confirmed_cases):
    """Print quick exploratory summaries of a DataFrame: head/tail, dtypes,
    per-column counts, and null counts.

    Args:
        confirmed_cases (pd.DataFrame): merged cases/population frame.
    """
    # First few rows to get a sense of the data
    print("##############print first 10 rows:#################")
    print(confirmed_cases.head(10))
    # Last ten rows (fixed stray '(' in the banner)
    print("##############print last 10 rows:##############")
    print(confirmed_cases.tail(10))
    # Column data types
    print("##############print column data types:##############")
    print(confirmed_cases.dtypes)
    # Non-null counts per column
    print("#############print column counts:##############")
    print(confirmed_cases.count())
    # Null counts per column
    print("##############Check nulls:##############")
    # BUG FIX: removed trailing "(edited)" text that made this line a SyntaxError
    print(confirmed_cases.isnull().sum())
    
#group county populations by state
def createStatePOP(confirmed_cases):
    """Return total population per state as a two-column DataFrame
    with columns ['State', 'population']."""
    # Pivot on State summing county populations, then flatten the index back
    # into an ordinary column so downstream merges can join on 'State'.
    state_population = confirmed_cases.pivot_table(
        values=['population'], index=['State'], aggfunc='sum'
    ).reset_index()
    return state_population


def meltData(confirmed_cases):
    """Melt the wide per-date columns into long form, back the result up to
    CSV, and return total confirmed cases per state as of the latest date.

    NOTE(review): columns[6:] assumes six leading metadata columns; if the
    frame has only five (countyFIPS, County Name, State, population,
    StateFIPS) this silently skips the first date column — TODO confirm.
    """
    # Everything from the 7th column onward is treated as a date column
    date_columns = confirmed_cases.columns[6:]

    # Unpivot so each row is (State, Date, Confirmed)
    long_frame = confirmed_cases.melt(
        id_vars=['State'], value_vars=date_columns,
        var_name='Date', value_name='Confirmed',
    )
    # Back up the long-form data for later inspection
    long_frame.to_csv('df_confirm_long.csv', index=True)

    # Coerce counts to numbers; anything unparseable becomes NaN
    long_frame['Confirmed'] = pd.to_numeric(long_frame['Confirmed'], errors='coerce')

    # Counts are cumulative, so the latest date already holds the running
    # total; sum its county rows up to the state level.
    latest = date_columns[-1]
    on_latest = long_frame[long_frame['Date'] == latest]
    return on_latest.groupby(['State'])['Confirmed'].sum().reset_index()



#need to run after merge, arrange, melt
def createRate(df_statepop, df2_melt):
    """Attach state population to per-state confirmed totals and add a
    'Rate' column = confirmed cases as a percentage of state population.

    Must run after merge, arrange, and melt steps.
    """
    enriched = df2_melt.merge(df_statepop, how='left', on=['State'])
    # Same operand order as before so floating-point results match exactly
    enriched['Rate'] = (enriched['Confirmed'] / enriched['population']) * 100
    return enriched



#need to run after merge, arrange, melt, create rate    
#Plot geo map for statewide rate = cases/population
def createUSAMap(createRate):
    """Render a USA choropleth of confirmed-case rate (cases/population %)
    per state and save an interactive copy to state_rate.html.

    Args:
        createRate (pd.DataFrame): output of the createRate() function with
            'State' and 'Rate' columns. (Parameter name kept for backward
            compatibility even though it shadows that function.)
    """
    map_data = dict(type='choropleth',
                    colorscale='Portland',
                    locations=createRate['State'],
                    locationmode='USA-states',
                    z=createRate['Rate'],
                    text=createRate['State'],
                    colorbar={'title': 'Cases/Population%'},
                    marker=dict(line=dict(color='rgb(255,255,255)', width=2))
                    )
    # BUG FIX: corrected typo in the displayed title ("Popualtion")
    map_layout = dict(title='Percentage of COVID-19 Confirmed Cases to Date by Population of the State',
                      geo=dict(scope='usa',
                               showlakes=True,  # lakes
                               lakecolor='rgb(255, 255, 255)'))
    map_cases = go.Figure(map_data)
    map_cases.update_layout(map_layout)
    map_cases.show()
    # Persist an interactive copy alongside the notebook
    map_cases.write_html("state_rate.html")



#Storing data by Month to use for the heat map
#need to run merge before this
def structureDataTimeSeries(confirmed_cases):
    """Reshape the wide cases frame into a long time series aggregated to
    (Date, State) with a real datetime 'Date' column.

    NOTE(review): after dropping 'population' there appear to be four
    metadata columns, so columns[5:] may skip the first date column —
    TODO confirm whether that is intentional.
    """
    # Population is not needed for the time series
    no_population = confirmed_cases.drop(columns=['population'])

    # Treat everything from the 6th remaining column onward as dates
    date_columns = no_population.columns[5:]

    # Unpivot so each row is (State, Date, Confirmed)
    long_frame = no_population.melt(id_vars='State', value_vars=date_columns,
                                    var_name='Date', value_name='Confirmed')

    # Sum counties up to the state level for each day
    long_frame = long_frame.groupby(['Date', 'State'])['Confirmed'].sum().reset_index()

    # Convert the date strings into timestamps for plotting/resampling
    long_frame['Date'] = pd.to_datetime(long_frame['Date'])
    return long_frame

#need to run merge, arrange, structure before this
def createHeatMap(df_confirm_long):
    """Draw a State x Month heatmap of the maximum confirmed-case count and
    save it as heatmap.png and heatmapByState.png.

    Must run after merge, arrange, and structure steps.

    Args:
        df_confirm_long: long frame with 'State', datetime 'Date' and
            'Confirmed' columns, as produced by structureDataTimeSeries().

    Returns:
        the matplotlib.pyplot module (kept for backward compatibility).
    """
    # Label every row with its month name so we can pivot on it
    df_confirm_long['month'] = df_confirm_long['Date'].dt.month_name()
    # We could also add day names here for further analysis if required.

    # Counts are cumulative, so max() per month gives the month-end total;
    # sort=False preserves the chronological month order found in the data.
    df_month = df_confirm_long.groupby(['State', 'month'], sort=False)['Confirmed'].max().reset_index()

    # Save the aggregate for back-up / offline inspection
    df_month.to_csv('df_cases_months.csv', index=True)

    # Rows = states, columns = months, cells = confirmed count
    heatmap_data = pd.pivot_table(df_month, values='Confirmed', index='State', columns='month')

    # Re-order columns chronologically (pivot_table sorts them alphabetically)
    months = df_month.month.unique().tolist()
    heatmap_data = heatmap_data.reindex(columns=months)

    # BUG FIX: was figsize=(200, 200) — a 200x200-inch canvas at dpi=150 is a
    # ~30000px image that is extremely slow or impossible to render.
    plt.figure(figsize=(20, 12))
    m = sns.heatmap(heatmap_data, cmap='RdBu_r', robust=True)
    m.set_xticklabels(heatmap_data.columns, rotation=45)
    m.set_yticklabels(heatmap_data.index, rotation=45)
    m.set_title('Confirmed Cases by month for all the States in USA')

    # BUG FIX: save BOTH files before plt.show(); in non-interactive backends
    # show() releases the figure, so the old post-show savefig wrote a blank image.
    plt.savefig('heatmap.png', dpi=150)
    plt.savefig("heatmapByState.png")
    plt.show()
    return plt

def createLinePlot(df_confirm_long):
    """Plot confirmed-case trends (log y-scale) for five hard-hit states."""
    # Explicit figure/axes so the styling below targets a known axes object
    fig, ax = plt.subplots()

    # Limit the comparison to a handful of large states
    selected_states = ['TX', 'CA', 'FL', 'AZ', 'NY']
    subset = df_confirm_long[df_confirm_long['State'].isin(selected_states)]

    # Log scale keeps both early and late growth phases visible
    ax.set(yscale="log")
    ax.set_title("COVID-19 Cases Trends for AZ, CA, FL, NY, TX")

    # One line per state, with per-state markers/line styles
    sns.lineplot(x='Date', y='Confirmed', data=subset, hue='State', ax=ax,
                 style='State', markers=True)
    plt.show()
    
def plot_geomap_counties(cases, deaths):
    """Plot an interactive county-level choropleth of total cases to date,
    with county name and total deaths in the hover data.

    Args:
        cases:  wide frame of cumulative confirmed cases per county.
        deaths: wide frame of cumulative deaths per county.
    """
    # Counts are cumulative, so the last date column holds the total to date
    last_date = cases.columns.tolist()[-1]

    # .copy() so the column assignments below do not trigger
    # SettingWithCopyWarning on a view of `cases`
    max_cases = cases[['countyFIPS', 'County Name', 'State', f'{last_date}']].copy()

    deaths = deaths[['countyFIPS', f'{last_date}']]

    # Map total deaths onto the cases frame keyed by county FIPS
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[f'{last_date}']))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)

    # GeoJSON with county boundaries keyed by 5-digit, zero-padded FIPS strings
    with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
        counties = json.load(response)

    # The geojson uses zero-padded 5-character FIPS codes; zfill replaces the
    # old manual "add a zero if len == 4" loop.
    max_cases['countyFIPS'] = max_cases['countyFIPS'].astype(str)
    max_cases['fips'] = max_cases['countyFIPS'].str.zfill(5)

    # Drop the synthetic "Statewide Unallocated" rows (FIPS 0).
    # BUG FIX: the old comparison was `!= 0` against a *string* column, which
    # was always True and therefore never filtered anything.
    max_cases = max_cases.loc[max_cases['countyFIPS'] != '0']

    # Plot the choropleth of case counts
    fig = px.choropleth_mapbox(max_cases, geojson=counties, locations='fips', color=f'{last_date}',
                               # color_continuous_scale="tempo",
                               range_color=(1, 5000),
                               mapbox_style="carto-positron",
                               zoom=3, center={"lat": 37.0902, "lon": -95.7129},
                               opacity=0.5,
                               labels={f'{last_date}': '# Cases', 'County Name': 'County: ', 'Deaths': '# Deaths'},
                               hover_data=['County Name', 'Deaths']
                               )
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.show()
    


    
def plot_bar(cases, deaths):
    """Plot two bar charts: total COVID-19 cases per state and total deaths
    per state, sorted by case count.

    Args:
        cases:  wide frame of cumulative confirmed cases per county.
        deaths: wide frame of cumulative deaths per county.
    """
    # Counts are cumulative, so the last date column holds the total to date
    last_date = cases.columns.tolist()[-1]

    # .copy() so assigning 'Deaths' below does not warn on a view of `cases`
    max_cases = cases[['countyFIPS', 'County Name', 'State', f'{last_date}']].copy()

    deaths = deaths[['countyFIPS', f'{last_date}']]

    # Map total deaths onto the cases frame keyed by county FIPS
    deaths_dict = dict(zip(deaths['countyFIPS'], deaths[f'{last_date}']))
    max_cases['Deaths'] = max_cases['countyFIPS'].map(deaths_dict)

    # BUG FIX: drop the non-numeric county name before summing. On pandas
    # >= 2.0, groupby().sum() keeps object columns (concatenating strings),
    # which makes the 4-column rename below raise.
    state_counts = max_cases.drop(columns=['County Name']).groupby('State').sum().reset_index()

    # Rename the aggregated columns
    state_counts.columns = ['State', 'CountyFIPS', 'NumCases', 'NumDeaths']

    # BUG FIX: sort_values returns a new frame; the result was previously
    # discarded, so the bars were never actually sorted.
    state_counts = state_counts.sort_values(by='NumCases')

    # Bar chart of case counts per state
    fig = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumCases'])
    ])
    fig.update_layout(barmode='group', title='Number of COVID-19 Cases by State')
    fig.show()

    # Bar chart of death counts per state
    fig2 = go.Figure(data=[
        go.Bar(name='# Cases', x=state_counts['State'], y=state_counts['NumDeaths'])
    ])
    fig2.update_layout(barmode='group', title='Number of COVID-19 Deaths by State')
    fig2.show()

"""
Function to get the differences in cases for the entire dataset
"""
def get_diff_by_state(cases, state_name):
    """Return daily new-case counts per county for one state.

    Args:
        cases: wide frame of cumulative confirmed cases per county.
        state_name: two-letter state abbreviation, e.g. 'TX'.

    Returns:
        pd.DataFrame with one row per date, one column per county, holding
        day-over-day differences (the first row is NaN).
    """
    # Keep only the requested state's rows
    selected = cases.loc[cases['State'] == str(state_name)]

    # Transpose so dates become rows and counties become columns
    transposed = selected.T
    transposed.columns = transposed.iloc[1]  # row 1 holds the county names

    # The synthetic statewide row is not a real county
    transposed = transposed.drop(columns='Statewide Unallocated')

    # The first five rows are metadata (FIPS, name, state, ...), not dates
    transposed = transposed.iloc[5:]

    # Day-over-day difference for every county column, collected into a dict
    # keyed by county name (same shape the old explicit loop produced)
    daily_changes = {county: transposed[county].diff() for county in transposed}

    return pd.DataFrame.from_dict(daily_changes)


"""
Function to get the differences in cases for the last week
"""
def get_diff_last_week(cases, state_name):
    """Return recent daily new-case counts per county for one state.

    NOTE(review): despite the name, the window is days=14 (two weeks) —
    TODO confirm whether 7 days was intended.

    Args:
        cases: wide frame of cumulative confirmed cases per county.
        state_name: two-letter state abbreviation, e.g. 'TX'.

    Returns:
        pd.DataFrame indexed by 'YYYY-MM-DD' strings, one column per county.
    """
    # Keep only the requested state's rows
    selected = cases.loc[cases['State'] == state_name]

    # Transpose so dates become rows and counties become columns
    transposed = selected.T
    transposed.columns = transposed.iloc[1]  # row 1 holds the county names

    # The synthetic statewide row is not a real county
    transposed = transposed.drop(columns='Statewide Unallocated')

    # The first five rows are metadata (FIPS, name, state, ...), not dates
    transposed = transposed.iloc[5:]

    # Day-over-day difference for every county column
    daily_changes = {county: transposed[county].diff() for county in transposed}
    diff_counties = pd.DataFrame.from_dict(daily_changes)

    # Window start: 14 days before today (see NOTE above)
    today = pd.to_datetime(date.today())
    window_start = today - pd.Timedelta(days=14)

    # Promote the date index to a real datetime column so we can filter on it
    diff_counties = diff_counties.reset_index().rename(columns={'index': 'Date'})
    diff_counties['Date'] = pd.to_datetime(diff_counties['Date'], infer_datetime_format=True)

    # Keep only the rows inside the window
    diff_counties = diff_counties.loc[diff_counties['Date'] >= window_start]

    # Index by normalized date string and drop the helper column
    diff_counties.index = pd.to_datetime(diff_counties['Date'], format='%m/%d/%Y')
    diff_counties.index = diff_counties.index.strftime('%Y-%m-%d')
    return diff_counties.drop(columns=['Date'])
In [5]:
# Fetch the latest cumulative cases and deaths (requires network access)
confirmed_cases = scrape_live_data()
confirmed_deaths = scrape_live_data_deaths()

Wide Date

In [6]:
# Aggregate population by state (input for the rate map)
statePOP = createStatePOP(confirmed_cases)
# Melt wide date columns into per-state totals for the latest date
meltData_df = meltData(confirmed_cases)
# Cases/population rate per state for the choropleth.
# NOTE(review): this rebinds the name `createRate` from the function to its
# result, so this cell cannot be re-run without re-defining the function —
# consider renaming the variable.
createRate = createRate(statePOP,meltData_df)

# Long-form (Date, State, Confirmed) time series for the heat/line plots
df_confirm_long = structureDataTimeSeries(confirmed_cases)

Transformations

Confirmed

Line

HeatMap

In [7]:
# Choropleth of case rate per state (createRate now holds the rate DataFrame)
createUSAMap(createRate)

# Bar charts of total cases and deaths per state
plot_bar(confirmed_cases, confirmed_deaths)

# Line plot comparing trends for five large states
createLinePlot(df_confirm_long)

HeatMap

Geo Map

In [8]:
# County-level interactive map (downloads county GeoJSON from the internet)
plot_geomap_counties(confirmed_cases, confirmed_deaths)

New Cases 1

New Cases 2

Testing

Beyond Specs

Conclusion